librarian::shelf(
DT, dplyr, dismo, GGally, here, readr, tidyr)
##
## The 'cran_repo' argument in shelf() was not set, so it will use
## cran_repo = 'https://cran.r-project.org' by default.
##
## To avoid this message, set the 'cran_repo' argument to a CRAN
## mirror URL (see https://cran.r-project.org/mirrors.html) or set
## 'quiet = TRUE'.
select <- dplyr::select # overwrite raster::select
options(readr.show_col_types = F)
dir_data <- here("data/sdm")
pts_env_csv <- file.path(dir_data, "pts_env.csv")
pts_env <- read_csv(pts_env_csv)
nrow(pts_env)
## [1] 20000
datatable(pts_env, rownames = F)
Show correlations between variables.
GGally::ggpairs(
select(pts_env, -ID),
aes(color = factor(present)))
Pairs plot with present color coded.
Full model workflow with split, fit, predict and evaluate process steps.
Let’s setup a data frame with only the data we want to model by:
# setup model data
d <- pts_env %>%
select(-ID) %>% # remove terms we don't want to model
tidyr::drop_na() # drop the rows with NA values
nrow(d)
## [1] 19912
Let’s start as simply as possible with a linear model lm() on multiple predictors X to predict presence y using a simpler workflow.
Simpler workflow with only fit and predict of all data, i.e. no splitting.
# fit a linear model
mdl <- lm(present ~ ., data = d)
summary(mdl)
##
## Call:
## lm(formula = present ~ ., data = d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.28329 -0.09504 0.01992 0.13295 0.86890
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.541635705 0.148412953 10.39 <0.0000000000000002 ***
## WC_alt -0.000507343 0.000009035 -56.16 <0.0000000000000002 ***
## WC_bio1 -0.081418278 0.001422555 -57.23 <0.0000000000000002 ***
## WC_bio2 -0.029554373 0.000979939 -30.16 <0.0000000000000002 ***
## ER_tri -0.002474917 0.000123228 -20.08 <0.0000000000000002 ***
## ER_topoWet -0.034489623 0.002817935 -12.24 <0.0000000000000002 ***
## lon -0.043608547 0.001170300 -37.26 <0.0000000000000002 ***
## lat -0.105980684 0.001060644 -99.92 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2465 on 19904 degrees of freedom
## Multiple R-squared: 0.7571, Adjusted R-squared: 0.757
## F-statistic: 8864 on 7 and 19904 DF, p-value: < 0.00000000000000022
y_predict <- predict(mdl, d, type="response")
y_true <- d$present
range(y_predict)
## [1] -0.5193748 1.3127056
range(y_true)
## [1] 0 1
The problem with these predictions is that it ranges outside the possible values of present 1 and absent 0. (Later we’ll deal with converting values within this range to either 1 or 0 by applying a cutoff value; i.e. any values > 0.5 become 1 and below become 0.)
To solve this problem of constraining the response term to being between the two possible values, i.e. the probability \(p\) of being one or the other possible \(y\) values, we’ll apply the logistic transformation on the response term.
\[ logit(p_i) = \log_{e}\left( \frac{p_i}{1-p_i} \right) \]
We can expand the expansion of the predicted term, i.e. the probability \(p\) of being either \(y\), with all possible predictors \(X\) whereby each coeefficient \(b\) gets multiplied by the value of \(x\):
\[ \log_{e}\left( \frac{p_i}{1-p_i} \right) = b_0 + b_1 x_{1,i} + b_2 x_{2,i} + \cdots + b_k x_{k,i} \]
# fit a generalized linear model with a binomial logit link function
mdl <- glm(present ~ ., family = binomial(link="logit"), data = d)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(mdl)
##
## Call:
## glm(formula = present ~ ., family = binomial(link = "logit"),
## data = d)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.4334 -0.0049 0.0000 0.1261 3.2228
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -169.1651358 6.1541922 -27.488 < 0.0000000000000002 ***
## WC_alt -0.0016905 0.0002623 -6.446 0.000000000115 ***
## WC_bio1 0.5523077 0.0428240 12.897 < 0.0000000000000002 ***
## WC_bio2 -0.8249942 0.0280555 -29.406 < 0.0000000000000002 ***
## ER_tri -0.0277293 0.0032753 -8.466 < 0.0000000000000002 ***
## ER_topoWet -0.7398075 0.0554387 -13.345 < 0.0000000000000002 ***
## lon -1.8891459 0.0557322 -33.897 < 0.0000000000000002 ***
## lat -1.2155325 0.0311678 -39.000 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 27603.6 on 19911 degrees of freedom
## Residual deviance: 4549.7 on 19904 degrees of freedom
## AIC: 4565.7
##
## Number of Fisher Scoring iterations: 9
y_predict <- predict(mdl, d, type="response")
range(y_predict)
## [1] 0.0000000000000002220446 0.9999712687694529700266
Excellent, our response is now constrained between 0 and 1. Next, let’s look at the term plots to see the relationship between predictor and response.
# show term plots
termplot(mdl, partial.resid = TRUE, se = TRUE, main = F, ylim="free")
With a generalized additive model we can add “wiggle” to the relationship between predictor and response by introducing smooth s() terms.
librarian::shelf(mgcv)
##
## The 'cran_repo' argument in shelf() was not set, so it will use
## cran_repo = 'https://cran.r-project.org' by default.
##
## To avoid this message, set the 'cran_repo' argument to a CRAN
## mirror URL (see https://cran.r-project.org/mirrors.html) or set
## 'quiet = TRUE'.
# fit a generalized additive model with smooth predictors
mdl <- mgcv::gam(
formula = present ~ s(WC_alt) + s(WC_bio1) +
s(WC_bio2) + s(ER_tri) + s(ER_topoWet) + s(lon) + s(lat),
family = binomial, data = d)
summary(mdl)
##
## Family: binomial
## Link function: logit
##
## Formula:
## present ~ s(WC_alt) + s(WC_bio1) + s(WC_bio2) + s(ER_tri) + s(ER_topoWet) +
## s(lon) + s(lat)
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -33.22 29.62 -1.122 0.262
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(WC_alt) 3.135 3.643 68.83 < 0.0000000000000002 ***
## s(WC_bio1) 8.052 8.137 45.45 < 0.0000000000000002 ***
## s(WC_bio2) 7.915 8.314 196.31 < 0.0000000000000002 ***
## s(ER_tri) 6.216 7.199 24.10 0.00123 **
## s(ER_topoWet) 5.481 6.669 30.49 0.0000399 ***
## s(lon) 7.751 7.989 179.68 < 0.0000000000000002 ***
## s(lat) 8.907 8.995 354.86 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.917 Deviance explained = 89.1%
## UBRE = -0.84373 Scale est. = 1 n = 19912
# show term plots
plot(mdl, scale=0)
Maxent is probably the most commonly used species distribution model (Elith 2011) since it performs well with few input data points, only requires presence points (and samples background for comparison) and is easy to use with a Java graphical user interface (GUI).
# load extra packages
librarian::shelf(
maptools, sf)
##
## The 'cran_repo' argument in shelf() was not set, so it will use
## cran_repo = 'https://cran.r-project.org' by default.
##
## To avoid this message, set the 'cran_repo' argument to a CRAN
## mirror URL (see https://cran.r-project.org/mirrors.html) or set
## 'quiet = TRUE'.
mdl_maxent_rds <- file.path(dir_data, "mdl_maxent.rds")
# show version of maxent
if (!interactive())
maxent()
## Loading required namespace: rJava
## This is MaxEnt version 3.4.3
# get environmental rasters
# NOTE: the first part of Lab 1. SDM - Explore got updated to write this clipped environmental raster stack
env_stack_grd <- file.path(dir_data, "env_stack.grd")
env_stack <- stack(env_stack_grd)
plot(env_stack, nc=2)
# get presence-only observation points (maxent extracts raster values for you)
obs_geo <- file.path(dir_data, "obs.geojson")
obs_sp <- read_sf(obs_geo) %>%
sf::as_Spatial() # maxent prefers sp::SpatialPoints over newer sf::sf class
# fit a maximum entropy model
if (!file.exists(mdl_maxent_rds)){
mdl <- maxent(env_stack, obs_sp)
readr::write_rds(mdl, mdl_maxent_rds)
}
mdl <- read_rds(mdl_maxent_rds)
# plot variable contributions per predictor
plot(mdl)
# plot term plots
response(mdl)
# predict
y_predict <- predict(env_stack, mdl) #, ext=ext, progress='')
plot(y_predict, main='Maxent, raw prediction')
data(wrld_simpl, package="maptools")
plot(wrld_simpl, add=TRUE, border='dark grey')
Notice how the plot() function produces different outputs depending on the class of the input object. You can view help for each of these with R Console commands: ?plot.lm, ?plot.gam and .plot,DistModel,numeric-method